#!/usr/bin/python3

"""
This script reads the disassembled.code files in the malicious_apk
and benign_apk folders and copies the Android and Java methods called
into a JSON file for later analysis.

The output data format is as follows:
{"features": ["java/lang/String.length", ...],
 "apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1], "malicious": [0,1]}, ...}}
"""

import os
import json
import glob

__author__='mwleeds'

def _extract_features(lines):
    """Yield the feature name for every usable ``// Method`` reference in *lines*.

    Each line is searched for the ``// Method `` marker; the text between the
    marker and the next ``:`` is taken as the referenced method. Only names
    starting with ``java`` are kept, and each is reduced to the part before
    the first ``.`` (the class path). Names that look obfuscated are dropped.
    The same name may be yielded more than once; callers de-duplicate.
    """
    for line in lines:
        pieces = line.split('// Method ')
        if len(pieces) < 2:
            continue  # no method reference on this line
        method = pieces[1].split(':')[0]
        # To also keep Android framework references, test for the prefixes
        # ('java', 'android') here instead of just 'java'.
        if not method.startswith('java'):
            continue
        # Comment the below line to use methods rather than classes
        method = method.split('.')[0]
        parts = method.split('/')
        # the method is probably obfuscated (very short class name or a
        # one-letter package), or it has no package component at all; ignore it
        if len(parts) < 2 or len(parts[-1]) < 4 or len(parts[-2]) == 1:
            continue
        yield method


def main():
    """Build ``app_method_vectors.json`` from the disassembled apps.

    For every ``*.apk`` file found in ``benign_apk`` and ``malicious_apk``
    (relative to the current working directory), reads
    ``<category>/<apk name without .apk>/disassembled.code`` and records which
    features (see ``_extract_features``) the app references. Apps whose
    disassembly file is missing are reported on stdout and skipped.

    The result is written to ``app_method_vectors.json`` in the current
    working directory as ``{"features": [...], "apps": {name: {"vector":
    [...], "malicious": [...]}}}``, where ``vector`` has one 0/1 entry per
    feature, in the order features were first seen.

    The working directory is never changed.

    Raises:
        FileNotFoundError: if one of the two category directories is missing.
    """
    all_methods = []  # feature names in first-seen order (defines vector layout)
    seen_methods = set()  # same contents as all_methods, for O(1) membership tests
    app_method_map = {}  # mapping from android app names to sets of features
    app_malicious_map = {}  # mapping from android app names to one-hot labels
    for i, directory in enumerate(['benign_apk', 'malicious_apk']):
        # glob silently returns nothing for a missing directory; fail loudly
        # instead so a misplaced dataset is not mistaken for an empty one.
        if not os.path.isdir(directory):
            raise FileNotFoundError(
                'Category directory not found: {!r}'.format(directory))
        for apk_path in glob.glob(os.path.join(directory, '*.apk')):
            filename = os.path.basename(apk_path)
            print('Processing ' + filename)
            # the disassembly lives in a sibling folder named after the apk
            code_path = os.path.join(directory, filename[:-4], 'disassembled.code')
            app_methods = set()
            try:
                with open(code_path) as disassembled_code:
                    # iterate the file lazily rather than loading every line
                    for method in _extract_features(disassembled_code):
                        app_methods.add(method)
                        if method not in seen_methods:
                            seen_methods.add(method)
                            all_methods.append(method)
            except FileNotFoundError as e:
                # best effort: report the missing disassembly and move on
                print(e)
                continue
            app_method_map[filename] = app_methods
            # make a one-hot bit vector of length 2. 1st bit set if malicious, otherwise 2nd bit
            app_malicious_map[filename] = [1, 0] if i else [0, 1]
    # mapping combining app_method_map and app_malicious_map using bits
    all_apps = {
        app_name: {
            'vector': [1 if m in methods else 0 for m in all_methods],
            'malicious': app_malicious_map[app_name],
        }
        for app_name, methods in app_method_map.items()
    }
    with open('app_method_vectors.json', 'w') as outfile:
        json.dump({'features': all_methods, 'apps': all_apps}, outfile)
    print('Wrote data on ' + str(len(all_methods)) + ' methods and ' + str(len(all_apps)) + ' apps to a file.')

# Run the feature extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__=='__main__':
    main()
